from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = '/home/ubuntu/jack/model_cache/LLM-Research/Llama-3.2-1B-Instruct'  # locally downloaded checkpoint

DEVICE = "cuda:1" #指定设备

# torch_dtype="auto" picks the dtype from the checkpoint config; pin the whole
# model to DEVICE so it matches the input tensors (device_map="auto" could place
# the weights on a different GPU than the inputs, causing a device mismatch)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", device_map=DEVICE)
# print(model)
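# Optional sanity check (not in the original script): confirm where the weights
# landed and which dtype "auto" resolved to.
print(model.device, model.dtype)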

tokenizer = AutoTokenizer.from_pretrained(model_path)  # load the tokenizer from the same local directory
print(tokenizer)

# Call the model
# Define the prompt
prompt = "Hello, please introduce yourself."
# Wrap the prompt in a chat-style message list
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
# Use the tokenizer's apply_chat_template() to render the message list into a
# single prompt string; tokenize=False defers tokenization, and
# add_generation_prompt=True appends the header that cues the model to answer
# as the assistant
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
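# Optional: inspect the fully rendered prompt. For Llama 3 models the template
# inserts special tokens such as <|begin_of_text|> and <|start_header_id|>; the
# exact markers depend on the tokenizer's chat template.
print(text)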

# Tokenize the rendered text into model input tensors and move them to DEVICE
model_inputs = tokenizer([text], return_tensors="pt").to(DEVICE)
# Run generation; unpacking the full encoding (rather than passing only
# input_ids) also supplies the attention mask, avoiding a warning
generated = model.generate(**model_inputs, max_new_tokens=512)
print(generated)

# Decode the generated ids back to text (this includes the prompt, since
# generate() returns the prompt ids followed by the continuation)
response = tokenizer.batch_decode(generated, skip_special_tokens=True)
print(response)
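# The decode above still contains the prompt text. A common pattern (a sketch,
# assuming a batch of one) slices off the prompt length and decodes only the
# newly generated tokens:
prompt_len = model_inputs.input_ids.shape[-1]
answer = tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)
print(answer)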
